Reduce

对指定维度进行归约。

输入:
  • src_data - 输入数据的地址

  • param - 算子计算所需参数的结构体。其各成员见下述。

  • core_mask - 核掩码。

  • tmp_src_data、tmp_dst_data - 临时输入、输出空间(仅私有存储版本),需分配在DDR或SMC中。

ReduceParameter定义:

 1typedef struct ReduceParameter {
 2    void** data_buffers_; // 用于存储中间计算结果
 3    int* outer_sizes_; // 处理某个规约轴时,该轴之前所有轴的元素数
 4    int* inner_sizes_; // 某个规约轴之后的所有元素数
 5    int* axis_sizes_; // 规约轴的元素数
 6    int total_num_; // 输入张量的总元素数
 7    int num_axes_; // 待规约轴的数目
 8    int mode_; // 规约模式
 9    int output_num_; // 输出张量的总元素数
10    /**该算子会根据ReduceParameter中的mode_参数选择实际规约所使用的方法。共有如下几种方法:
11       Reduce_Mean=0,
12       Reduce_Max=1,
13       Reduce_Min=2,
14       Reduce_Prod=3,
15       Reduce_Sum=4,
16       Reduce_SumSquare=5,
17       Reduce_ASum=6,
18       Reduce_L2Norm=7
19    **/
20} ReduceParameter;

输出:
  • dst_data - 输出地址。

支持平台:

FT78NE、MT7004

备注:

  • FT78NE 支持int8, int16, int32, fp32, fp64

  • MT7004 支持fp16, fp32, int16, int32

共享存储版本:

/*
 * Shared-storage entry points of the Reduce operator, one per element type.
 *   src_data  - address of the input data
 *   dst_data  - output address
 *   param     - structure holding the parameters the operator needs
 *               (see ReduceParameter)
 *   core_mask - core mask
 */
void i8_reduce_s(int8_t *src_data, int8_t *dst_data, ReduceParameter *param, int core_mask)
void i16_reduce_s(int16_t *src_data, half *dst_data, ReduceParameter *param, int core_mask)
void i32_reduce_s(int *src_data, float *dst_data, ReduceParameter *param, int core_mask)
void hp_reduce_s(half *src_data, half *dst_data, ReduceParameter *param, int core_mask)
void fp_reduce_s(float *src_data, float *dst_data, ReduceParameter *param, int core_mask)
void dp_reduce_s(double *src_data, double *dst_data, ReduceParameter *param, int core_mask)

C调用示例:

 1void PackParam(ReduceParameter* param, int ndim, int* input_shape, int num_axes, int* axes) {
 2    int tmp_input_shape[8];
 3    int total_num = 1;
 4    int i, j, k;
 5    for (i = 0; i < ndim; i++) {
 6        tmp_input_shape[i] = input_shape[i];
 7        total_num *= input_shape[i];
 8    }
 9    param->total_num_ = total_num;
10    int offset_size = 0;
11    for (i = 0; i < num_axes; ++i) {
12        int axis = axes[i];
13        int outer_size = 1;
14        for (j = 0; j < axis; j++) {
15            outer_size *= tmp_input_shape[j];
16        }
17        param->outer_sizes_[offset_size] = outer_size;
18        int inner_size = 1;
19        for (k = axis + 1; k < ndim; k++) {
20            inner_size *= tmp_input_shape[k];
21        }
22        param->inner_sizes_[offset_size] = inner_size;
23        param->axis_sizes_[offset_size] = tmp_input_shape[axis];
24        offset_size++;
25        tmp_input_shape[axis] = 1;
26    }
27}
28
29void TestReduceSMCFp32(int* input_shape, int ndim, int* axes, int num_axes, int mode, int keep_dims, int core_mask) {
30    int core_id = get_core_id();
31    int logic_core_id = GetLogicCoreId(core_mask, core_id);
32    int core_num = GetCoreNum(core_mask);
33    float* input = (float*)0x88000000;
34    float* output = (float*)0x98000000;
35    ReduceParameter* param = (ReduceParameter*)0xA8480000;
36    if (logic_core_id == 0) {
37        param->num_axes_ = num_axes;
38        param->mode_ = mode;
39        param->data_buffers_ = (void**)0xA8483000;
40        param->inner_sizes_ = (int*)0xA8484000;
41        param->outer_sizes_ = (int*)0xA8485000;
42        param->axis_sizes_ = (int*)0xA8486000;
43        int i;
44        for (i = 0; i < num_axes - 1; i++) {
45            param->data_buffers_[i] = (void*)(0xA8490000 + 0x1000000);
46        }
47        PackParam(param, ndim, input_shape, num_axes, axes);
48    }
49    sys_bar(0, core_num); // 初始化参数完成后进行同步
50    fp_reduce_s(input, check, param, core_mask);
51}
52
/*
 * Entry point of the shared-storage example: reduces a 4 x 5 x 5 tensor over
 * axis 1 with mode 7 (Reduce_L2Norm), using a core mask with the four low
 * bits set.
 */
void main() {
    int shape[3] = {4, 5, 5};
    int reduce_axes[1] = {1};
    int rank = (int)(sizeof(shape) / sizeof(shape[0]));
    int reduce_count = (int)(sizeof(reduce_axes) / sizeof(reduce_axes[0]));
    int l2norm_mode = 7;
    int keep = 1;
    int mask = 0xF; /* same value as 0b1111 */

    TestReduceSMCFp32(shape, rank, reduce_axes, reduce_count, l2norm_mode, keep, mask);
}

私有存储版本:

/*
 * Private-storage entry points of the Reduce operator, one per element type.
 *   src_data     - address of the input data
 *   dst_data     - output address
 *   tmp_src_data - temporary input space
 *   tmp_dst_data - temporary output space
 *   param        - structure holding the parameters the operator needs
 *                  (see ReduceParameter)
 *   core_mask    - core mask
 * The accompanying example notes that src_data/dst_data must be allocated in
 * AM, that the temporary spaces must be allocated in DDR or SMC, and that the
 * private-storage version may only be started on a single core.
 */
void i8_reduce_p(int8_t *src_data, int8_t *dst_data, void *tmp_src_data, void *tmp_dst_data, ReduceParameter *param, int core_mask)
void i16_reduce_p(int16_t *src_data, half *dst_data, void *tmp_src_data, void *tmp_dst_data, ReduceParameter *param, int core_mask)
void i32_reduce_p(int *src_data, float *dst_data, void *tmp_src_data, void *tmp_dst_data, ReduceParameter *param, int core_mask)
void hp_reduce_p(half *src_data, half *dst_data, void *tmp_src_data, void *tmp_dst_data, ReduceParameter *param, int core_mask)
void fp_reduce_p(float *src_data, float *dst_data, void *tmp_src_data, void *tmp_dst_data, ReduceParameter *param, int core_mask)
void dp_reduce_p(double *src_data, double *dst_data, void *tmp_src_data, void *tmp_dst_data, ReduceParameter *param, int core_mask)

C调用示例:

 1void PackParam(ReduceParameter* param, int ndim, int* input_shape, int num_axes, int* axes) {
 2    int tmp_input_shape[8];
 3    int total_num = 1;
 4    int i, j, k;
 5    for (i = 0; i < ndim; i++) {
 6        tmp_input_shape[i] = input_shape[i];
 7        total_num *= input_shape[i];
 8    }
 9    param->total_num_ = total_num;
10    int offset_size = 0;
11    for (i = 0; i < num_axes; ++i) {
12        int axis = axes[i];
13        int outer_size = 1;
14        for (j = 0; j < axis; j++) {
15            outer_size *= tmp_input_shape[j];
16        }
17        param->outer_sizes_[offset_size] = outer_size;
18        int inner_size = 1;
19        for (k = axis + 1; k < ndim; k++) {
20            inner_size *= tmp_input_shape[k];
21        }
22        param->inner_sizes_[offset_size] = inner_size;
23        param->axis_sizes_[offset_size] = tmp_input_shape[axis];
24        offset_size++;
25        tmp_input_shape[axis] = 1;
26    }
27}
28
29void TestReduceL2Fp32(int* input_shape, int ndim, int* axes, int num_axes, int mode, int keep_dims, int core_mask) {
30    float* input = (float*)0x10000000; // 原始输入输出数据需分配在AM中
31    float* output = (float*)0x10010000;
32    float* tmp_input = (float*)0x88000000; // 临时输入输出空间需分配在DDR或SMC中
33    float* tmp_output = (float*)0x98000000;
34    ReduceParameter* param = (ReduceParameter*)0x10020000;
35    param->num_axes_ = num_axes;
36    param->mode_ = mode;
37    param->data_buffers_ = (void**)0x10021000;
38    param->inner_sizes_ = (int*)0x10022000;
39    param->outer_sizes_ = (int*)0x10023000;
40    param->axis_sizes_ = (int*)0x10024000;
41    int i, j;
42    for (i = 0; i < ndim; i++) {
43        int reduce_axis = 0;
44        for (j = 0; j < num_axes; j++) {
45            if (axes[j] == i) {
46                reduce_axis = 1;
47                break;
48            }
49        }
50        if (!reduce_axis) {
51            length *= input_shape[i];
52        }
53    }
54    for (i = 0; i < num_axes - 1; i++) {
55        param->data_buffers_[i] = (void*)(0xA8490000 + 0x1000000); // 每一个中间计算结果空间都需分配在DDR或SMC中
56    }
57    param->output_num_ = length;
58    PackParam(param, ndim, input_shape, num_axes, axes);
59    fp_reduce_p(input, check, param, core_mask);
60}
61
/*
 * Entry point of the private-storage example: reduces a 4 x 5 x 5 tensor over
 * axis 1 with mode 7 (Reduce_L2Norm).
 */
void main() {
    int shape[3] = {4, 5, 5};
    int reduce_axes[1] = {1};
    int rank = (int)(sizeof(shape) / sizeof(shape[0]));
    int reduce_count = (int)(sizeof(reduce_axes) / sizeof(reduce_axes[0]));
    int l2norm_mode = 7;
    int keep = 1;
    int mask = 0x1; /* same value as 0b0001; the private-storage version may only be started on one core */

    TestReduceL2Fp32(shape, rank, reduce_axes, reduce_count, l2norm_mode, keep, mask);
}